RNP

In [2]:
from __future__ import print_function
import os.path
import pickle
import pandas as pd
import sys
sys.path.insert(0, '../../')
import numpy as np
import itertools

#import Datanalytics as da 
from JKBio import TerraFunction as terra
from JKBio import Helper as h

from taigapy import TaigaClient
import dalmatian as dm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from JKBio.helper import pyDESeq2
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
import gseapy

from sklearn.preprocessing import scale
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from bokeh.plotting import *
from bokeh.models import HoverTool
import seaborn as sns

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython
tc = TaigaClient()
output_notebook()
Loading BokehJS ...

getting data

In [ ]:
! gsutil mv gs://transfer-amlproject/*MP7624* gs://transfer-amlproject/RNPv2/
In [ ]:
! gsutil -m cp -r gs://transfer-amlproject/RNPv3 gs://amlproject/RNA/
In [ ]:
! gsutil ls gs://amlproject/
In [3]:
sampleset='RNPv3'
In [ ]:
terra.uploadFromFolder('amlproject','RNPv3/',
                       'broad-firecloud-ccle/hg38_RNAseq',samplesetname=sampleset,
                      fformat="fastqR1R2", sep='_MP7624')

Processing

In [ ]:
wm = dm.WorkspaceManager('broad-firecloud-ccle/hg38_RNAseq')
In [ ]:
submission_id = wm.create_submission("star_v1-0_BETA_cfg", sampleset, 'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
In [ ]:
submission_id = wm.create_submission("rsem_v1-0_BETA_cfg", 
                                      sampleset,'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
In [ ]:
submission_id = wm.create_submission("rsem_aggregate_results_v1-0_BETA_cfg", 
                                         sampleset)
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
In [ ]:
results = wm.get_sample_sets().loc[sampleset]
rsem_genes_expected_count = results['rsem_genes_expected_count']
In [3]:
project = "RNPv2"
version = "scaling_2"
In [ ]:
results
In [ ]:
mkdir ../../data/$project

Loading

In [ ]:
! gsutil cp $rsem_genes_expected_count ../../data/$project/
In [7]:
file = '../../data/'+project+'/'+rsem_genes_expected_count.split('/')[-1]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-ad8b5b11ee0a> in <module>
----> 1 file = '../../data/'+project+'/'+rsem_genes_expected_count.split('/')[-1]

NameError: name 'rsem_genes_expected_count' is not defined
In [ ]:
file
In [ ]:
! gunzip $file
In [ ]:
file
In [ ]:
rsem_genes_expected_count = pd.read_csv(file[:-3], sep='\t')
In [5]:
rsem_genes_expected_count = pd.read_csv("../data/"+project+"/RNPv3.rsem_genes_expected_count.txt", sep='\t')
In [114]:
data = rsem_genes_expected_count.drop("transcript_id(s)",1)
In [115]:
data["gene_id"] = h.convertGenes(data['gene_id'])[0]
you need access to taiga for this (https://pypi.org/project/taigapy/)
20702 could not be parsed... we don't have all genes already
In [116]:
data=data.set_index('gene_id')
In [117]:
data
Out[117]:
1 10 11 12 13 14 15 16 17 18 ... 67 68 69 7 70 71 72 73 8 9
gene_id
TSPAN6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
TNMD 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
DPM1 1619.00 2465.00 1701.00 1535.00 1863.00 2093.00 2027.00 2202.00 2148.00 2235.00 ... 1620.00 1840.00 1729.00 1983.00 1926.0 1846.00 1915.00 2633.00 2451.00 2378.00
SCYL3 464.57 846.12 672.69 603.75 577.41 617.97 601.43 545.49 575.14 536.97 ... 430.78 460.04 437.36 542.42 572.5 507.48 580.49 713.56 670.02 576.38
C1orf112 780.43 1031.90 755.31 676.25 1232.70 1209.00 1309.60 1370.50 1245.90 1257.10 ... 949.22 1277.00 1032.60 1163.60 783.5 1088.50 1184.50 1572.40 1481.00 1332.90
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 3.00 5.00 8.00 2.00 2.00 1.00 2.00 1.00 3.00 3.00 ... 1.00 1.00 5.00 1.00 6.0 3.00 3.00 4.00 2.00 4.00
ERCC-00165 215.00 594.00 424.00 509.00 136.00 88.00 165.00 258.00 161.00 163.00 ... 93.00 139.00 87.00 127.00 628.0 207.00 151.00 241.00 187.00 176.00
ERCC-00168 3.00 12.00 9.00 8.00 0.00 8.00 0.00 5.00 5.00 1.00 ... 3.00 4.00 1.00 3.00 8.0 5.00 4.00 7.00 8.00 3.00
ERCC-00170 66.00 205.00 133.00 211.00 57.00 40.00 73.00 94.00 42.00 40.00 ... 41.00 56.00 33.00 50.00 141.0 72.00 92.00 110.00 89.00 88.00
ERCC-00171 13554.00 40900.00 29090.00 33242.00 10039.00 6399.00 10836.00 15684.00 9526.00 8893.00 ... 7058.00 7576.00 5882.00 8381.00 47913.0 12046.00 10447.00 17316.00 10492.00 12389.00

58813 rows × 73 columns

In [118]:
rename = {"1": "mr120-MV411-RNP_IRF2BP2-r4",
"2": "mr121-MV411-RNP_IRF2BP2-r5",
"3": "mr122-MV411-RNP_IRF2BP2-r6",
"4": "mr123-MV411-RNP_IRF8-r4",
"5": "mr124-MV411-RNP_IRF8-r5",
"6": "mr125-MV411-RNP_IRF8-r6",
"7": "mr126-MV411-RNP_MEF2D-r4",
"8": "mr127-MV411-RNP_MEF2D-r5",
"9": "mr128-MV411-RNP_MEF2D-r6",
"10": "mr129-MV411-RNP_MYC-r4",
"11": "mr130-MV411-RNP_MYC-r5",
"12": "mr131-MV411-RNP_MYC-r6",
"13": "mr132-MV411-RNP_RUNX1-r4",
"14": "mr133-MV411-RNP_RUNX1-r5",
"15": "mr134-MV411-RNP_RUNX1-r6",
"16": "mr135-MV411-RNP_RUNX2-r4",
"17": "mr136-MV411-RNP_RUNX2-r5",
"18": "mr137-MV411-RNP_RUNX2-r6",
"19": "mr138-MV411-RNP_SPI1-r4",
"20": "mr139-MV411-RNP_SPI1-r5",
"21": "mr140-MV411-RNP_SPI1-r6",
"22": "mr141-MV411-RNP_ZMYND8-r4",
"23": "mr142-MV411-RNP_ZMYND8-r5",
"24": "mr143-MV411-RNP_ZMYND8-r6",
"25": "mr144-MV411-RNP_LMO2-r4",
"26": "mr145-MV411-RNP_LMO2-r5",
"27": "mr146-MV411-RNP_LMO2-r6",
"28": "mr147-MV411-RNP_LYL1-r4",
"29": "mr148-MV411-RNP_LYL1-r5",
"30": "mr149-MV411-RNP_LYL1-r6",
"31": "mr150-MV411-RNP_MAX-r4",
"32": "mr151-MV411-RNP_MAX-r5",
"33": "mr152-MV411-RNP_MAX-r6",
"34": "mr153-MV411-RNP_ZEB2-r4",
"35": "mr154-MV411-RNP_ZEB2-r5",
"36": "mr155-MV411-RNP_ZEB2-r6",
"37": "mr156-MV411-RNP_MEF2C-r4",
"38": "mr157-MV411-RNP_MEF2C-r5",
"39": "mr158-MV411-RNP_MEF2C-r6",
"40": "mr159-MV411-RNP_MEIS1-r4",
"41": "mr160-MV411-RNP_MEIS1-r5",
"42": "mr161-MV411-RNP_MEIS1-r6",
"43": "mr162-MV411-RNP_FLI1-r4",
"44": "mr163-MV411-RNP_FLI1-r5",
"45": "mr164-MV411-RNP_FLI1-r6",
"46": "mr165-MV411-RNP_ELF2-r4",
"47": "mr166-MV411-RNP_ELF2-r5",
"48": "mr167-MV411-RNP_ELF2-r6",
"49": "mr168-MV411-RNP_GFI1-r4",
"50": "mr169-MV411-RNP_GFI1-r5",
"51": "mr170-MV411-RNP_GFI1-r6",
"52": "mr171-MV411-RNP_IKZF1-r4",
"53": "mr172-MV411-RNP_IKZF1-r5",
"54": "mr173-MV411-RNP_IKZF1-r6",
"55": "mr174-MV411-RNP_CEBPA-r4",
"56": "mr175-MV411-RNP_CEBPA-r5",
"57": "mr176-MV411-RNP_CEBPA-r6",
"58": "mr177-MV411-RNP_MYB-r4",
"59": "mr178-MV411-RNP_MYB-r5",
"60": "mr179-MV411-RNP_MYB-r6",
"61": "mr180-MV411-RNP_MYBL2-r1",
"62": "mr181-MV411-RNP_MYBL2-r2",
"63": "mr182-MV411-RNP_MYBL2-r3",
"64": "mr183-MV411-RNP_HOXA9-r4",
"65": "mr184-MV411-RNP_HOXA9-r5",
"66": "mr185-MV411-RNP_HOXA9-r6",
"67": "mr186-MV411-RNP_AAVS1-r1",
"68": "mr187-MV411-RNP_AAVS1-r2",
"69": "mr188-MV411-RNP_AAVS1-r3",
"70": "mr189-MV411-RNP_SPI1-r7",
"71": "mr190-MV411-RNP_SP1-r4",
"72": "mr191-MV411-RNP_SP1-r5",
"73": "mr192-MV411-RNP_SP1-r6"}
In [119]:
data.columns
Out[119]:
Index(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2',
       '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3', '30',
       '31', '32', '33', '34', '35', '36', '37', '38', '39', '4', '40', '41',
       '42', '43', '44', '45', '46', '47', '48', '49', '5', '50', '51', '52',
       '53', '54', '55', '56', '57', '58', '59', '6', '60', '61', '62', '63',
       '64', '65', '66', '67', '68', '69', '7', '70', '71', '72', '73', '8',
       '9'],
      dtype='object')
In [120]:
data.columns = [rename[i] for i in data.columns]
In [ ]:
data

post processing and filtering

filter some more

In [121]:
toremove = np.argwhere(data.values.var(1)==0)
toremove.ravel()
Out[121]:
array([    1,    15,    24, ..., 58714, 58715, 58718])
In [122]:
data = data.drop(data.iloc[toremove.ravel()].index,0)
In [14]:
data.shape
Out[14]:
(38787, 73)
In [86]:
data[data.index.str.contains('ERCC')]
Out[86]:
RNP_CEBPA RNP_ELF2 RNP_FLI1 RNP_GFI1 RNP_HOXA9 RNP_IKZF1 RNP_IRF2BP2 RNP_IRF8 RNP_LMO2 RNP_LYL1 ... RNP_MEIS1 RNP_MYB RNP_MYBL2 RNP_MYC RNP_RUNX1 RNP_RUNX2 RNP_SP1 RNP_SPI1 RNP_ZEB2 RNP_ZMYND8
ERCC1 0.213129 0.303065 0.424820 -0.013338 0.720478 0.301391 -1.519144 -0.150769 0.308328 0.254115 ... 0.433164 -0.405264 0.613641 -1.966010 -0.064005 -0.234768 -0.473219 -1.465504 -0.560256 0.039887
ERCC8 -0.161144 0.310575 0.456050 0.435417 0.694854 0.659261 -1.729489 0.067545 0.340136 0.145100 ... 0.492535 -1.018353 0.625291 -2.958253 0.033777 -0.077966 -0.477258 -1.277412 -0.502424 -0.051049
ERCC2 -0.101898 0.344646 0.487830 -0.024586 0.758630 0.365316 -1.564490 0.109321 0.253974 0.158182 ... 0.560113 -0.809031 0.537718 -2.413037 0.100273 -0.100976 -0.359823 -1.383595 -0.497120 0.032041
ERCC5 0.329847 0.370887 0.445285 -0.036549 0.718265 0.355537 -1.669172 0.012511 0.333557 0.317322 ... 0.375729 -1.020325 0.716546 -2.242334 -0.008906 -0.226874 -0.556690 -1.190012 -0.412360 -0.060632
ERCC3 -0.000455 0.401906 0.428938 0.267055 0.650901 0.469671 -1.798361 -0.120519 0.321405 0.249243 ... 0.404594 -0.658176 0.629318 -2.178038 0.013789 -0.209330 -0.647685 -0.959858 -0.324506 -0.035175
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 -0.837536 -2.679307 -0.155242 -3.158132 -2.314087 0.598644 0.443121 -0.031885 -0.299387 0.788603 ... -0.797601 -1.050161 0.013926 -0.054725 -0.334608 0.529373 0.632518 0.321083 -1.321885 -1.050444
ERCC-00165 -0.416577 -0.192966 -0.158998 -0.600933 -0.010283 0.022655 0.086795 -0.521364 0.063089 0.061745 ... 0.021979 0.086623 0.003628 -0.002804 -0.626433 0.179648 -0.069769 0.005323 0.027799 -0.627124
ERCC-00168 -0.631838 0.437075 0.226472 -0.943388 -0.467661 0.865111 -0.250046 -0.596701 0.118189 -0.933793 ... -0.500321 -0.848169 0.067743 0.246156 -0.213288 0.373271 0.682603 0.005824 -1.315818 -2.214663
ERCC-00170 -0.620167 -0.105858 -0.104200 -0.844148 -0.242513 -0.029162 0.170384 -0.640951 0.360669 0.120560 ... -0.086921 -0.129397 -0.093660 0.001588 -0.306315 -0.135824 0.318423 -0.235470 0.001009 -0.741242
ERCC-00171 -0.356140 0.008629 -0.082360 -0.540067 0.038664 -0.025910 0.082481 -0.595385 -0.011672 0.020116 ... 0.050009 0.028884 -0.021258 0.031320 -0.540170 -0.006482 -0.074723 0.056912 -0.022644 -0.582991

103 rows × 23 columns

In [124]:
ERCC = data[~data.index.str.contains('ENSG00')]
In [125]:
ensg = data[data.index.str.contains('ENSG00')]
In [126]:
data = data[~data.index.str.contains('ENSG00')]

renormalize the data

In [21]:
len(ERCC)
Out[21]:
26672
In [22]:
ERCC
Out[22]:
mr120-MV411-RNP_IRF2BP2-r4 mr129-MV411-RNP_MYC-r4 mr130-MV411-RNP_MYC-r5 mr131-MV411-RNP_MYC-r6 mr132-MV411-RNP_RUNX1-r4 mr133-MV411-RNP_RUNX1-r5 mr134-MV411-RNP_RUNX1-r6 mr135-MV411-RNP_RUNX2-r4 mr136-MV411-RNP_RUNX2-r5 mr137-MV411-RNP_RUNX2-r6 ... mr186-MV411-RNP_AAVS1-r1 mr187-MV411-RNP_AAVS1-r2 mr188-MV411-RNP_AAVS1-r3 mr126-MV411-RNP_MEF2D-r4 mr189-MV411-RNP_SPI1-r4 mr190-MV411-RNP_SP1-r4 mr191-MV411-RNP_SP1-r5 mr192-MV411-RNP_SP1-r6 mr127-MV411-RNP_MEF2D-r5 mr128-MV411-RNP_MEF2D-r6
gene_id
TSPAN6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 0.00 0.0 0.00 0.00 0.00 0.00 0.00
DPM1 1619.00 2465.00 1701.00 1535.00 1863.00 2093.00 2027.00 2202.00 2148.00 2235.00 ... 1620.00 1840.00 1729.00 1983.00 1926.0 1846.00 1915.00 2633.00 2451.00 2378.00
SCYL3 464.57 846.12 672.69 603.75 577.41 617.97 601.43 545.49 575.14 536.97 ... 430.78 460.04 437.36 542.42 572.5 507.48 580.49 713.56 670.02 576.38
C1orf112 780.43 1031.90 755.31 676.25 1232.70 1209.00 1309.60 1370.50 1245.90 1257.10 ... 949.22 1277.00 1032.60 1163.60 783.5 1088.50 1184.50 1572.40 1481.00 1332.90
FGR 1443.00 8556.00 6387.00 5955.00 2359.00 2615.00 2258.00 3340.00 3229.00 3466.00 ... 2323.00 2401.00 2230.00 3680.00 2016.0 2285.00 2384.00 3106.00 4706.00 4308.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ERCC-00164 3.00 5.00 8.00 2.00 2.00 1.00 2.00 1.00 3.00 3.00 ... 1.00 1.00 5.00 1.00 6.0 3.00 3.00 4.00 2.00 4.00
ERCC-00165 215.00 594.00 424.00 509.00 136.00 88.00 165.00 258.00 161.00 163.00 ... 93.00 139.00 87.00 127.00 628.0 207.00 151.00 241.00 187.00 176.00
ERCC-00168 3.00 12.00 9.00 8.00 0.00 8.00 0.00 5.00 5.00 1.00 ... 3.00 4.00 1.00 3.00 8.0 5.00 4.00 7.00 8.00 3.00
ERCC-00170 66.00 205.00 133.00 211.00 57.00 40.00 73.00 94.00 42.00 40.00 ... 41.00 56.00 33.00 50.00 141.0 72.00 92.00 110.00 89.00 88.00
ERCC-00171 13554.00 40900.00 29090.00 33242.00 10039.00 6399.00 10836.00 15684.00 9526.00 8893.00 ... 7058.00 7576.00 5882.00 8381.00 47913.0 12046.00 10447.00 17316.00 10492.00 12389.00

26672 rows × 73 columns

Loading the CRC members

In [5]:
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
Out[5]:
['ARID2',
 'CEBPA',
 'CEBPE',
 'E2F3',
 'FLI1',
 'FOSL2',
 'GFI1',
 'GFI1B',
 'HHEX',
 'IRF8',
 'LYL1',
 'MEF2C',
 'MEF2D',
 'MEIS1',
 'MTF1',
 'MYB',
 'MYC',
 'PLAGL2',
 'RUNX1',
 'RUNX2',
 'RXRA',
 'SETDB1',
 'SNAPC5',
 'SP1',
 'SPI1',
 'SREBF1',
 'STAT5B',
 'TERF2',
 'TFAP4',
 'ZEB2',
 'ZFPM1',
 'ZMYND8',
 'LMO2',
 'MAX',
 'ELF2',
 'ETV6',
 'HOXA9',
 'GATA2']

Correlation analysis across replicates

In [127]:
a = data.columns.tolist()
a.sort()
data =data[a]
In [128]:
%matplotlib inline
ig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(data.corr(), 
            xticklabels=data.columns,
            yticklabels=data.columns, ax=ax)
plt.savefig('../results/'+project+'/plots/correlation_'+version+'.pdf')
In [130]:
data.to_csv('../results/'+project+'/'+version+'_counts.csv')
In [6]:
data = pd.read_csv('../results/'+project+'/'+version+'_counts.csv',index_col=0)
In [131]:
%matplotlib inline
sns.clustermap(data.corr(), figsize=(20, 20))
plt.savefig('../results/'+project+'/plots/cluster_corr_count_'+version+'.pdf')
In [111]:
data.sum().tolist()
Out[111]:
[30688523.16000027,
 34169438.19000007,
 41005571.74999994,
 46084912.53000021,
 45160327.37999997,
 47341768.669999816,
 42514609.540000096,
 53887098.10999983,
 50098488.209999934,
 45113084.95000005,
 36111622.839999996,
 35557153.02000013,
 44226942.32999988,
 44972217.44999995,
 45597265.59999981,
 46973180.16999991,
 44722493.25999973,
 43388113.94999987,
 56093119.09999998,
 41192195.44999994,
 51835051.10999948,
 44630323.86999974,
 40642810.10999972,
 46705531.499999784,
 43054712.28999993,
 44869251.65999987,
 45054145.239999846,
 44455780.129999846,
 36223056.36999981,
 43574264.2799998,
 41398339.949999824,
 49867049.0299997,
 49291525.72,
 41605938.48999976,
 43148081.94999997,
 40112534.0200001,
 37580594.24999993,
 40132040.0899999,
 40010332.329999894,
 42092629.67999986,
 42759963.28999992,
 50757108.769999936,
 39484881.03000006,
 42747050.27999977,
 50293476.15999993,
 46082589.51999986,
 36142588.94999987,
 42719167.15999994,
 60568119.45000004,
 19614926.17000023,
 38045957.559999876,
 30660726.830000017,
 28941980.730000205,
 45153855.01999972,
 33219425.880000006,
 14375818.309999943,
 39201346.75999991,
 37543527.6499999,
 35604913.22999999,
 30051533.790000107,
 36132240.930000104,
 74965385.61000054,
 30798922.20000013,
 41478281.1099998,
 42887521.889999785,
 41261481.03999971,
 32843576.130000114,
 38465277.630000055,
 34187426.81000007,
 43665357.69999966,
 37198889.630000085,
 44527455.94999992,
 53400482.219999835]
In [112]:
data.shape
Out[112]:
(26672, 73)

WES REMOVE SPI1-r4 because it looks bad

In [133]:
data= data.drop(columns=['mr138-MV411-RNP_SPI1-r4'])
ERCC = ERCC.drop(columns=['mr138-MV411-RNP_SPI1-r4'])
In [134]:
%matplotlib inline
sns.clustermap(data.corr(), figsize=(20, 20))
plt.savefig('../results/'+project+'/plots/cluster_corr_count_gene_removed_'+version+'.pdf')

Making and running the dashboard

In [23]:
%%R
library('erccdashboard')
R[write to console]: Loading required package: ggplot2

R[write to console]: Loading required package: gridExtra

R[write to console]: 
Attaching package: ‘gridExtra’


R[write to console]: The following object is masked from ‘package:Biobase’:

    combine


R[write to console]: The following object is masked from ‘package:BiocGenerics’:

    combine


In [135]:
ERCC = ERCC.astype(int)
In [136]:
ERCC['Feature'] = ERCC.index
In [83]:
ERCC
------------------------------------------------------------------
NameError                        Traceback (most recent call last)
<ipython-input-83-e899fab420d0> in <module>
----> 1 ERCC

NameError: name 'ERCC' is not defined
In [30]:
sns.heatmap(np.log2(ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SPI1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values / ERCC[ERCC.index.str.contains('ERCC-')][['mr186-MV411-RNP_AAVS1-r1', 'mr187-MV411-RNP_AAVS1-r2', 'mr188-MV411-RNP_AAVS1-r3','mr129-MV411-RNP_MYC-r4', 'mr189-MV411-RNP_SPI1-r4', 'mr120-MV411-RNP_IRF2BP2-r4']].values.mean(0)+1))
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba0d5598e0>
In [7]:
experiments = list(set([i.split('-')[2] for i in data.columns[:-1]]))
experiments.remove("RNP_AAVS1")
In [40]:
#TODO: compute the mass from concentration
###################################################
### code chunk number 3: defineInputData
###################################################
%R datType = "count" # "count" for RNA-Seq data, "array" for microarray data
%R isNorm = F # flag to indicate if input expression measures are already normalized, default is FALSE 
%R -i project,version filenameRoot = paste("../",project,"/ERCCdashboard_",version, sep = "", collapse = NULL) # user defined filename prefix for results files
%R sample2Name = "AAVS1" # name for sample 2 in the experiment
%R erccmix = "Single" # name of ERCC mixture design, "RatioPair" is default
%R erccdilution = 1/100 # dilution factor used for Ambion spike-in mixtures
%R spikeVol = 1 # volume (in microliters) of diluted spike-in mixture added to total RNA mass
%R choseFDR = 0.1 # user defined false discovery rate (FDR), default is 0.05
Out[40]:
array([0.1])
In [138]:
cols = list(ERCC.columns)
cols.sort()
res={}
for val in experiments:
    d = {}
    e=0
    d.update({
        'Feature':'Feature'
    })
    for i in cols[1:]:
        if val+'-' in i:
            e+=1
            d.update({i: val.split('_')[-1]+'_'+str(e)})
    d.update({
        'mr186-MV411-RNP_AAVS1-r1': 'AAVS1_1',
        'mr187-MV411-RNP_AAVS1-r2': 'AAVS1_2',
    })
    if len(d)==6:
        d.update({
            'mr188-MV411-RNP_AAVS1-r3': 'AAVS1_3'
        })
    a = ERCC[list(d.keys())].rename(columns=d)
    a.to_csv('../data/ERCC_estimation.csv', index=None)
    val = val.split('_')[-1]
    
    torm = '../results/'+project+'/ercc_'+version+'_'+val+'.AAVS1.All.Pvals.csv'
    ! rm $torm 
    %R -i val print(val)
    %R print(sample2Name)
    %R a <- read.csv('../data/ERCC_estimation.csv')
    %R print(head(a))
    %R exDat = ''
    %R totalRNAmass <- 0.5
    try:
        %R -i val exDat = initDat(datType = datType, isNorm = isNorm, exTable = a, filenameRoot = filenameRoot, sample1Name = val, sample2Name = sample2Name, erccmix = erccmix, erccdilution = erccdilution, spikeVol = spikeVol, totalRNAmass = totalRNAmass, choseFDR = choseFDR)
        %R exDat = est_r_m(exDat)
        %R exDat = dynRangePlot(exDat)
    except Warning:
        print("failed for "+val)
        continue
    except:
        print('worked for '+val)
    %R print(summary(exDat))
    %R grid.arrange(exDat$Figures$dynRangePlot)
    %R grid.arrange(exDat$Figures$r_mPlot)
    %R grid.arrange(exDat$Figures$rangeResidPlot)
    %R -o rm rm <- exDat$Results$r_m.res$r_m.mn
    %R -o se se <- exDat$Results$r_m.res$r_m.mnse
    res[val] = (rm[0],se[0])
rm: cannot remove '../results/RNPv2/ercc_scaling_2_IRF2BP2.AAAVS1.All.Pvals.csv': No such file or directory
[1] "IRF2BP2"
[1] "AAAVS1"
   Feature IRF2BP2_1 IRF2BP2_2 IRF2BP2_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6         0         0         0        0        0        0
2     DPM1      1619      1938      2043     1620     1840     1729
3    SCYL3       464       545       564      430      460      437
4 C1orf112       780       776       908      949     1277     1032
5      FGR      1443      1587      1765     2323     2401     2230
6      CFH         3         5        15        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.IRF2BP2.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16582 transcripts remain for  analysis.
A total of 13 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00024 ERCC-00048 ERCC-00057
ERCC-00061 ERCC-00075 ERCC-00083 ERCC-00086 ERCC-00098
ERCC-00117 ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1614.75 1750.75 2094 1704 1995 1776
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
79 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
-1.242144 

GLM log(r_m) estimate weighted s.e.:
0.2116787 

Number of ERCCs in Mix 1 dyn range:  79 

Number of ERCCs in Mix 2 dyn range:  79 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00041 ERCC-00138 ERCC-00017 ERCC-00073 ERCC-00081
ERCC-00104 ERCC-00109 ERCC-00123 ERCC-00134 ERCC-00137


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MYB.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MYB"
[1] "AAAVS1"
   Feature MYB_1 MYB_2 MYB_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6     0     0     0        0        0        0
2     DPM1  1695  1557  1288     1620     1840     1729
3    SCYL3   582   482   460      430      460      437
4 C1orf112   831   825   776      949     1277     1032
5      FGR  3674  3220  2807     2323     2401     2230
6      CFH    10    17    11        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MYB.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16938 transcripts remain for  analysis.
A total of 19 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00109 ERCC-00117
ERCC-00137 ERCC-00138 ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1929.75 1799 1536 1654.75 1933 1726.75
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
73 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00039 ERCC-00144

GLM log(r_m) estimate:
-0.5666497 

GLM log(r_m) estimate weighted s.e.:
0.1645544 

Number of ERCCs in Mix 1 dyn range:  73 

Number of ERCCs in Mix 2 dyn range:  73 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00040 ERCC-00097 ERCC-00104 ERCC-00120 ERCC-00123
ERCC-00134 ERCC-00164 ERCC-00168 ERCC-00073


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_LYL1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "LYL1"
[1] "AAAVS1"
   Feature LYL1_1 LYL1_2 LYL1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   1954   1656   2061     1620     1840     1729
3    SCYL3    572    428    588      430      460      437
4 C1orf112   1241    952   1107      949     1277     1032
5      FGR   2786   2397   3052     2323     2401     2230
6      CFH      7     14     13        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.LYL1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16829 transcripts remain for  analysis.
A total of 20 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00137 ERCC-00138 ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2302 1853 2252 1669 1951 1743
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
72 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00144

GLM log(r_m) estimate:
0.1154768 

GLM log(r_m) estimate weighted s.e.:
0.09762555 

Number of ERCCs in Mix 1 dyn range:  72 

Number of ERCCs in Mix 2 dyn range:  72 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00097 ERCC-00134 ERCC-00168 ERCC-00073 ERCC-00123


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MAX.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MAX"
[1] "AAAVS1"
   Feature MAX_1 MAX_2 MAX_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6     0     0     0        0        0        0
2     DPM1  1811  2032  2172     1620     1840     1729
3    SCYL3   571   656   742      430      460      437
4 C1orf112  1215  1387  1393      949     1277     1032
5      FGR  3640  4163  4084     2323     2401     2230
6      CFH     9     5     3        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MAX.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16957 transcripts remain for  analysis.
A total of 15 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00048
ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081 ERCC-00083
ERCC-00086 ERCC-00098 ERCC-00117 ERCC-00138 ERCC-00142

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2142 2502 2512 1651 1928 1725
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
77 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00039

GLM log(r_m) estimate:
-0.6875484 

GLM log(r_m) estimate weighted s.e.:
0.1118295 

Number of ERCCs in Mix 1 dyn range:  77 

Number of ERCCs in Mix 2 dyn range:  77 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00123 ERCC-00134 ERCC-00168 ERCC-00041 ERCC-00073
ERCC-00104 ERCC-00109 ERCC-00137 ERCC-00156


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_IRF8.AAAVS1.All.Pvals.csv': No such file or directory
[1] "IRF8"
[1] "AAAVS1"
   Feature IRF8_1 IRF8_2 IRF8_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   2211   2243   2269     1620     1840     1729
3    SCYL3    611    621    622      430      460      437
4 C1orf112   1390   1268   1244      949     1277     1032
5      FGR   3652   3917   4442     2323     2401     2230
6      CFH     16     17     15        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.IRF8.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16800 transcripts remain for  analysis.
A total of 18 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041 ERCC-00048
ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081 ERCC-00083
ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109 ERCC-00117
ERCC-00123 ERCC-00138 ERCC-00142

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2385 2327 2453 1672.25 1957.5 1744.25
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
74 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.08350448 

GLM log(r_m) estimate weighted s.e.:
0.1106991 

Number of ERCCs in Mix 1 dyn range:  74 

Number of ERCCs in Mix 2 dyn range:  74 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00012 ERCC-00013 ERCC-00134 ERCC-00137 ERCC-00164
ERCC-00168 ERCC-00073 ERCC-00156


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_GFI1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "GFI1"
[1] "AAAVS1"
   Feature GFI1_1 GFI1_2 GFI1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   3000    984   1798     1620     1840     1729
3    SCYL3    708    258    466      430      460      437
4 C1orf112   1813    586   1037      949     1277     1032
5      FGR   2396    788   1525     2323     2401     2230
6      CFH     42     18     35        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.GFI1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16711 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00134 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
3122 1018 1947 1690.5 1977 1757
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.0614345 

GLM log(r_m) estimate weighted s.e.:
0.1106509 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00040 ERCC-00097 ERCC-00120 ERCC-00137 ERCC-00158
ERCC-00164 ERCC-00168 ERCC-00073


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_ZEB2.AAAVS1.All.Pvals.csv': No such file or directory
[1] "ZEB2"
[1] "AAAVS1"
   Feature ZEB2_1 ZEB2_2 ZEB2_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   2361   2261   1810     1620     1840     1729
3    SCYL3    531    527    481      430      460      437
4 C1orf112   1086   1059    945      949     1277     1032
5      FGR   2523   2566   2552     2323     2401     2230
6      CFH      1      1      0        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.ZEB2.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16877 transcripts remain for  analysis.
A total of 19 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00013 ERCC-00016 ERCC-00017 ERCC-00024
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00138 ERCC-00142

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2103 2164 2008 1663 1944 1734
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
73 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
-0.1701759 

GLM log(r_m) estimate weighted s.e.:
0.1445402 

Number of ERCCs in Mix 1 dyn range:  73 

Number of ERCCs in Mix 2 dyn range:  73 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00031 ERCC-00041 ERCC-00097 ERCC-00120 ERCC-00156
ERCC-00158 ERCC-00164 ERCC-00073 ERCC-00134 ERCC-00137


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_ZMYND8.AAAVS1.All.Pvals.csv': No such file or directory
[1] "ZMYND8"
[1] "AAAVS1"
   Feature ZMYND8_1 ZMYND8_2 ZMYND8_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6        0        0        0        0        0        0
2     DPM1     2140     1697     1859     1620     1840     1729
3    SCYL3      608      551      661      430      460      437
4 C1orf112     1311     1123     1319      949     1277     1032
5      FGR     4209     3864     4504     2323     2401     2230
6      CFH        8        6        7        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.ZMYND8.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17092 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2342 2038.25 2372 1633 1900.25 1707
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.05464554 

GLM log(r_m) estimate weighted s.e.:
0.1512365 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00040 ERCC-00120 ERCC-00134 ERCC-00168 ERCC-00073


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_SP1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "SP1"
[1] "AAAVS1"
   Feature SP1_1 SP1_2 SP1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6     0     0     0        0        0        0
2     DPM1  1846  1915  2633     1620     1840     1729
3    SCYL3   507   580   713      430      460      437
4 C1orf112  1088  1184  1572      949     1277     1032
5      FGR  2285  2384  3106     2323     2401     2230
6      CFH    13    15    15        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.SP1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16885 transcripts remain for  analysis.
A total of 17 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00048
ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081 ERCC-00083
ERCC-00086 ERCC-00098 ERCC-00117 ERCC-00134 ERCC-00138
ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1930 2242 2756 1662 1941 1733
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
75 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00147

GLM log(r_m) estimate:
-0.403555 

GLM log(r_m) estimate weighted s.e.:
0.1372525 

Number of ERCCs in Mix 1 dyn range:  75 

Number of ERCCs in Mix 2 dyn range:  75 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00040 ERCC-00041 ERCC-00097 ERCC-00104 ERCC-00120
ERCC-00073 ERCC-00109 ERCC-00123 ERCC-00137


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_SPI1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "SPI1"
[1] "AAAVS1"
   Feature SPI1_1 SPI1_2 SPI1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      1      0        0        0        0
2     DPM1   1729   2302   1926     1620     1840     1729
3    SCYL3    648    744    572      430      460      437
4 C1orf112    742   1104    783      949     1277     1032
5      FGR   1766   2458   2016     2323     2401     2230
6      CFH     22     58     15        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.SPI1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17089 transcripts remain for  analysis.
A total of 14 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00024 ERCC-00048 ERCC-00057
ERCC-00061 ERCC-00075 ERCC-00083 ERCC-00086 ERCC-00098
ERCC-00104 ERCC-00117 ERCC-00123 ERCC-00142

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2121 2742 2076 1633 1901 1707
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
78 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00097 ERCC-00033

GLM log(r_m) estimate:
-1.037793 

GLM log(r_m) estimate weighted s.e.:
0.2478424 

Number of ERCCs in Mix 1 dyn range:  78 

Number of ERCCs in Mix 2 dyn range:  78 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00097 ERCC-00109 ERCC-00134 ERCC-00137 ERCC-00138
ERCC-00017 ERCC-00041 ERCC-00073 ERCC-00081 ERCC-00156


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_RUNX2.AAAVS1.All.Pvals.csv': No such file or directory
[1] "RUNX2"
[1] "AAAVS1"
   Feature RUNX2_1 RUNX2_2 RUNX2_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    2202    2148    2235     1620     1840     1729
3    SCYL3     545     575     536      430      460      437
4 C1orf112    1370    1245    1257      949     1277     1032
5      FGR    3340    3229    3466     2323     2401     2230
6      CFH      16      12      14        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.RUNX2.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17042 transcripts remain for  analysis.
A total of 20 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00109 ERCC-00117
ERCC-00123 ERCC-00137 ERCC-00138 ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2369.75 2268 2240.75 1638.75 1908.5 1710.75
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
72 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
-0.2177111 

GLM log(r_m) estimate weighted s.e.:
0.1235403 

Number of ERCCs in Mix 1 dyn range:  72 

Number of ERCCs in Mix 2 dyn range:  72 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00073 ERCC-00097 ERCC-00134 ERCC-00104


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MEF2C.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MEF2C"
[1] "AAAVS1"
   Feature MEF2C_1 MEF2C_2 MEF2C_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1877    1951    1803     1620     1840     1729
3    SCYL3     459     498     519      430      460      437
4 C1orf112    1127    1049    1138      949     1277     1032
5      FGR    2652    3037    2824     2323     2401     2230
6      CFH       3       7       5        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MEF2C.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16818 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00134 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1959.75 2084 2098.75 1669.75 1953.75 1743.75
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.2163322 

GLM log(r_m) estimate weighted s.e.:
0.1600957 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00013 ERCC-00097 ERCC-00123 ERCC-00164 ERCC-00168
ERCC-00073


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_HOXA9.AAAVS1.All.Pvals.csv': No such file or directory
[1] "HOXA9"
[1] "AAAVS1"
   Feature HOXA9_1 HOXA9_2 HOXA9_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1842    2075    2081     1620     1840     1729
3    SCYL3     516     575     602      430      460      437
4 C1orf112    1174    1241    1190      949     1277     1032
5      FGR    2239    2364    2372     2323     2401     2230
6      CFH       4      10       8        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.HOXA9.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16777 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2114 2247 2145 1675 1962 1750
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.4186265 

GLM log(r_m) estimate weighted s.e.:
0.1449086 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00040 ERCC-00073 ERCC-00097 ERCC-00120 ERCC-00134
ERCC-00147 ERCC-00164


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MYC.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MYC"
[1] "AAAVS1"
   Feature MYC_1 MYC_2 MYC_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6     0     0     0        0        0        0
2     DPM1  2465  1701  1535     1620     1840     1729
3    SCYL3   846   672   603      430      460      437
4 C1orf112  1031   755   676      949     1277     1032
5      FGR  8556  6387  5955     2323     2401     2230
6      CFH     5     1     2        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MYC.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17015 transcripts remain for  analysis.
A total of 11 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00024 ERCC-00048 ERCC-00057
ERCC-00061 ERCC-00075 ERCC-00083 ERCC-00098 ERCC-00117
ERCC-00142

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2374 1836.5 1790.5 1643 1913.5 1714
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
81 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
-1.430577 

GLM log(r_m) estimate weighted s.e.:
0.1054966 

Number of ERCCs in Mix 1 dyn range:  81 

Number of ERCCs in Mix 2 dyn range:  81 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00041 ERCC-00017 ERCC-00073 ERCC-00081 ERCC-00086
ERCC-00104 ERCC-00109 ERCC-00123 ERCC-00134 ERCC-00137
ERCC-00138 ERCC-00156


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_IKZF1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "IKZF1"
[1] "AAAVS1"
   Feature IKZF1_1 IKZF1_2 IKZF1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1299    1529    2015     1620     1840     1729
3    SCYL3     361     406     571      430      460      437
4 C1orf112     836     967    1213      949     1277     1032
5      FGR    2082    1867    3154     2323     2401     2230
6      CFH       4       6       5        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.IKZF1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16752 transcripts remain for  analysis.
A total of 22 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00013 ERCC-00016 ERCC-00017 ERCC-00024
ERCC-00041 ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075
ERCC-00081 ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104
ERCC-00109 ERCC-00117 ERCC-00123 ERCC-00137 ERCC-00138
ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1576.25 1526 2348.25 1677 1966.25 1753
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
70 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.2648816 

GLM log(r_m) estimate weighted s.e.:
0.1193648 

Number of ERCCs in Mix 1 dyn range:  70 

Number of ERCCs in Mix 2 dyn range:  70 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00031 ERCC-00040 ERCC-00073 ERCC-00097 ERCC-00134
ERCC-00158 ERCC-00164 ERCC-00168


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_RUNX1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "RUNX1"
[1] "AAAVS1"
   Feature RUNX1_1 RUNX1_2 RUNX1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1863    2093    2027     1620     1840     1729
3    SCYL3     577     617     601      430      460      437
4 C1orf112    1232    1209    1309      949     1277     1032
5      FGR    2359    2615    2258     2323     2401     2230
6      CFH       8       9       7        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.RUNX1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17128 transcripts remain for  analysis.
A total of 22 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00134 ERCC-00137 ERCC-00138
ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2247.25 2328 2294.25 1629 1895 1703
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
70 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.01362734 

GLM log(r_m) estimate weighted s.e.:
0.1356891 

Number of ERCCs in Mix 1 dyn range:  70 

Number of ERCCs in Mix 2 dyn range:  70 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00031 ERCC-00097 ERCC-00120 ERCC-00168 ERCC-00073


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_CEBPA.AAAVS1.All.Pvals.csv': No such file or directory
[1] "CEBPA"
[1] "AAAVS1"
   Feature CEBPA_1 CEBPA_2 CEBPA_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1418     547    1781     1620     1840     1729
3    SCYL3     459     177     589      430      460      437
4 C1orf112     908     426    1171      949     1277     1032
5      FGR    1659     648    1791     2323     2401     2230
6      CFH       7       1      10        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.CEBPA.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16595 transcripts remain for  analysis.
A total of 22 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00013 ERCC-00016 ERCC-00017 ERCC-00024
ERCC-00041 ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075
ERCC-00081 ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104
ERCC-00109 ERCC-00117 ERCC-00134 ERCC-00137 ERCC-00138
ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1763 743 2081.5 1704 1993.5 1775
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
70 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.04728101 

GLM log(r_m) estimate weighted s.e.:
0.2244516 

Number of ERCCs in Mix 1 dyn range:  70 

Number of ERCCs in Mix 2 dyn range:  70 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00067 ERCC-00073 ERCC-00097 ERCC-00120 ERCC-00123
ERCC-00147 ERCC-00158 ERCC-00164 ERCC-00168


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MEF2D.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MEF2D"
[1] "AAAVS1"
   Feature MEF2D_1 MEF2D_2 MEF2D_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1983    2451    2378     1620     1840     1729
3    SCYL3     542     670     576      430      460      437
4 C1orf112    1163    1481    1332      949     1277     1032
5      FGR    3680    4706    4308     2323     2401     2230
6      CFH      17      12      14        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MEF2D.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17024 transcripts remain for  analysis.
A total of 17 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00048
ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081 ERCC-00083
ERCC-00086 ERCC-00098 ERCC-00109 ERCC-00117 ERCC-00123
ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2150.25 2742.25 2546 1642 1913 1713
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
75 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00144

GLM log(r_m) estimate:
-0.08897608 

GLM log(r_m) estimate weighted s.e.:
0.1281747 

Number of ERCCs in Mix 1 dyn range:  75 

Number of ERCCs in Mix 2 dyn range:  75 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00041 ERCC-00134 ERCC-00073 ERCC-00104 ERCC-00137
ERCC-00138


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_ELF2.AAAVS1.All.Pvals.csv': No such file or directory
[1] "ELF2"
[1] "AAAVS1"
   Feature ELF2_1 ELF2_2 ELF2_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   2516   1913   1971     1620     1840     1729
3    SCYL3    640    486    584      430      460      437
4 C1orf112   1315   1056   1278      949     1277     1032
5      FGR   3206   2242   2711     2323     2401     2230
6      CFH      4      8      5        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.ELF2.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16904 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00134 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2414.75 1863 2194 1658.5 1938.5 1731
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.1883588 

GLM log(r_m) estimate weighted s.e.:
0.1001319 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00040 ERCC-00073 ERCC-00120 ERCC-00123 ERCC-00164


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_FLI1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "FLI1"
[1] "AAAVS1"
   Feature FLI1_1 FLI1_2 FLI1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   1892   2087   2588     1620     1840     1729
3    SCYL3    450    555    668      430      460      437
4 C1orf112   1196   1338   1591      949     1277     1032
5      FGR   2480   2602   3360     2323     2401     2230
6      CFH      3      3      4        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.FLI1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16821 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2055 2218 2616 1669 1953 1743
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00039 ERCC-00019

GLM log(r_m) estimate:
0.2669788 

GLM log(r_m) estimate weighted s.e.:
0.08613995 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00013 ERCC-00097 ERCC-00120 ERCC-00134 ERCC-00164
ERCC-00073


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MEIS1.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MEIS1"
[1] "AAAVS1"
   Feature MEIS1_1 MEIS1_2 MEIS1_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1916    2046    2726     1620     1840     1729
3    SCYL3     477     554     683      430      460      437
4 C1orf112    1121    1128    1408      949     1277     1032
5      FGR    1935    2193    2556     2323     2401     2230
6      CFH       7       3      12        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MEIS1.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16907 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00117
ERCC-00123 ERCC-00134 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2115 2194 2639.5 1658 1938 1730
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.2082356 

GLM log(r_m) estimate weighted s.e.:
0.1646045 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00097 ERCC-00164 ERCC-00168 ERCC-00073 ERCC-00109


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_MYBL2.AAAVS1.All.Pvals.csv': No such file or directory
[1] "MYBL2"
[1] "AAAVS1"
   Feature MYBL2_1 MYBL2_2 MYBL2_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6       0       0       0        0        0        0
2     DPM1    1881    3921    1347     1620     1840     1729
3    SCYL3     469    1039     389      430      460      437
4 C1orf112    1108    2192     863      949     1277     1032
5      FGR    2573    5804    2117     2323     2401     2230
6      CFH      18      18       8        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.MYBL2.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  17053 transcripts remain for  analysis.
A total of 21 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00109
ERCC-00117 ERCC-00123 ERCC-00137 ERCC-00138 ERCC-00142
ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
1865 3829 1543 1638 1906 1710
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
71 

Outlier ERCCs for GLM r_m Estimate:
ERCC-00039

GLM log(r_m) estimate:
0.4145379 

GLM log(r_m) estimate weighted s.e.:
0.109987 

Number of ERCCs in Mix 1 dyn range:  71 

Number of ERCCs in Mix 2 dyn range:  71 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00013 ERCC-00031 ERCC-00073 ERCC-00077 ERCC-00097
ERCC-00120 ERCC-00134 ERCC-00147 ERCC-00158 ERCC-00168


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
rm: cannot remove '../results/RNPv2/ercc_scaling_2_LMO2.AAAVS1.All.Pvals.csv': No such file or directory
[1] "LMO2"
[1] "AAAVS1"
   Feature LMO2_1 LMO2_2 LMO2_3 AAAVS1_1 AAAVS1_2 AAAVS1_3
1   TSPAN6      0      0      0        0        0        0
2     DPM1   1907   2199   2141     1620     1840     1729
3    SCYL3    561    592    644      430      460      437
4 C1orf112   1229   1188   1285      949     1277     1032
5      FGR   2777   3265   2969     2323     2401     2230
6      CFH     13      8     10        6        5        9

Initializing the exDat list structure...
choseFDR = 0.1 
repNormFactor is NULL 
Filename root is: ../RNPv2/ERCCdashboard_scaling_2.LMO2.AAAVS1 

Transcripts were removed with a mean count < 1 or more than 2 
replicates with 0 counts.
Original data contained  26672 transcripts. 
After filtering  16882 transcripts remain for  analysis.
A total of 20 out of 92 
ERCC controls were filtered from the data set
The excluded ERCCs are:
ERCC-00012 ERCC-00016 ERCC-00017 ERCC-00024 ERCC-00041
ERCC-00048 ERCC-00057 ERCC-00061 ERCC-00075 ERCC-00081
ERCC-00083 ERCC-00086 ERCC-00098 ERCC-00104 ERCC-00117
ERCC-00123 ERCC-00134 ERCC-00138 ERCC-00142 ERCC-00156

repNormFactor is NULL,
 Using Default Upper Quartile Normalization Method  - 75th percentile

normVec:
2221.75 2325 2312.5 1662 1942.5 1733
Check for sample mRNA fraction differences(r_m)...

Number of ERCC Controls Used in r_m estimate
72 

Outlier ERCCs for GLM r_m Estimate:
None 

GLM log(r_m) estimate:
0.2066036 

GLM log(r_m) estimate weighted s.e.:
0.1053062 

Number of ERCCs in Mix 1 dyn range:  72 

Number of ERCCs in Mix 2 dyn range:  72 
These ERCCs were not included in the signal-abundance plot,
because not enough non-zero replicate measurements of these
 controls were obtained for both samples:

ERCC-00120 ERCC-00137 ERCC-00158 ERCC-00164 ERCC-00168
ERCC-00073 ERCC-00109


Saving dynRangePlot to exDat
              Length Class      Mode     
sampleInfo    11     -none-     list     
plotInfo       9     -none-     list     
erccInfo       4     -none-     list     
Transcripts    7     data.frame list     
designMat      3     data.frame list     
sampleNames    2     -none-     character
idCols         6     data.frame list     
normERCCDat    7     data.frame list     
normFactor     6     -none-     numeric  
mnLibeFactor   1     -none-     numeric  
spikeFraction  1     -none-     numeric  
idColsAdj      6     data.frame list     
Results        4     -none-     list     
Figures        3     -none-     list     
In [139]:
df= pd.DataFrame(data=res.values(),index=res.keys(), columns=['scaling','var'])
df['RNPs']=df.index
sns.barplot("RNPs","scaling",data=df,ci=None,)
plt.errorbar(x=range(0,len(df)),y=df['scaling'],
            yerr=df['var'], fmt='none', c= 'r')
plt.xticks(rotation=60,ha='right')
plt.savefig('../results/'+project+"/plots/"+version+"_scaling_fact_with_conf.pdf")
In [99]:
scaling
Out[99]:
{'IRF2BP2': [-1.2421436514123199, 0.2116786922337],
 'MYB': [-0.5666496866194601, 0.16455438308564643],
 'LYL1': [0.11547676609947306, 0.09762554626023551],
 'MAX': [-0.6875484167700773, 0.11182951672314183],
 'IRF8': [0.08350447764203282, 0.11069908626789565],
 'GFI1': [0.061434499699685764, 0.11065088877815657],
 'ZEB2': [-0.1701758517854591, 0.1445402147201962],
 'ZMYND8': [0.05464554271508272, 0.1512365231509835],
 'SP1': [-0.4035549545688615, 0.13725252591104356],
 'SPI1': [-1.0377933217786204, 0.24784235774321237],
 'RUNX2': [-0.21771114300468575, 0.12354032980074721],
 'MEF2C': [0.21633221486591706, 0.16009568270385865],
 'HOXA9': [0.41862648305962474, 0.14490862380188851],
 'MYC': [-1.430576835252246, 0.10549660323839703],
 'IKZF1': [0.26488156665796003, 0.11936483909099824],
 'RUNX1': [0.013627339651964025, 0.1356890688647267],
 'CEBPA': [0.04728101063315868, 0.22445160295741662],
 'MEF2D': [-0.08897607523943744, 0.12817467579731256],
 'ELF2': [0.18835876643089494, 0.10013191844645487],
 'FLI1': [0.2669788365781275, 0.08613995212995244],
 'MEIS1': [0.20823559991440868, 0.16460447494728012],
 'MYBL2': [0.4145378723566837, 0.10998698893732116],
 'LMO2': [0.2066036480588095, 0.10530622574043316]}
In [98]:
for i, v in scaling.items():
    if abs(v[0]) > v[1]:
        print(i, v[0])
IRF2BP2 -1.2421436514123199
MYB -0.5666496866194601
LYL1 0.11547676609947306
MAX -0.6875484167700773
ZEB2 -0.1701758517854591
SP1 -0.4035549545688615
SPI1 -1.0377933217786204
RUNX2 -0.21771114300468575
MEF2C 0.21633221486591706
HOXA9 0.41862648305962474
MYC -1.430576835252246
IKZF1 0.26488156665796003
ELF2 0.18835876643089494
FLI1 0.2669788365781275
MEIS1 0.20823559991440868
MYBL2 0.4145378723566837
LMO2 0.2066036480588095
In [79]:
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'AAVS1' in i]].mean()
Out[79]:
mr186-MV411-RNP_AAVS1-r1    2705.054348
mr187-MV411-RNP_AAVS1-r2    3576.510870
mr188-MV411-RNP_AAVS1-r3    2621.956522
dtype: float64
In [80]:
ERCC[ERCC.index.str.contains('ERCC-')][[i for i in ERCC.columns if 'SPI1' in i]].mean()
Out[80]:
mr138-MV411-RNP_SPI1-r4    34945.043478
mr139-MV411-RNP_SPI1-r5     8218.032609
mr140-MV411-RNP_SPI1-r6     8112.847826
mr189-MV411-RNP_SPI1-r4    17363.086957
dtype: float64
In [142]:
scaling = res
In [143]:
h.dictToFile(scaling,"../results/"+project+"/"+version+"_scaling.json")
In [97]:
scaling = h.fileToDict("../results/"+project+"/"+version+"_scaling.json")
In [ ]:
scaling

Differential expression analysis

In [145]:
data['gene_id'] = data.index
In [ ]:
housekeeping = ["C1orf43", "CHMP2A", "EMC7", "GPI", "PSMB2", "PSMB4", "RAB7A", "REEP5", "SNRPD3", "VCP", "VPS29"]
# from https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf
In [147]:
#results = {}
In [148]:
for val in experiments:  
    print(val)
    design = pd.DataFrame(index=data.columns[:-1], columns=['DMSO','Target'], 
                          data=np.array([[1 if 'RNP_AAVS1' in i else 0 for i in data.columns[:-1]],[1 if val+'-' in i else 0 for i in data.columns[:-1]]]).T)
    design.index = design.index.astype(str).str.replace('-','.')
    deseq = pyDESeq2.pyDESeq2(count_matrix=data, design_matrix = design, 
                              design_formula='~DMSO + Target', gene_column="gene_id")
    if abs(scaling[val.split('_')[1]][0]) > scaling[val.split('_')[1]][1]:
        print("estimating sizeFactors for this one")
        deseq.run_estimate_size_factors(controlGenes=data.gene_id.str.contains("ERCC-"))
    elif val in results:
        continue
    deseq.run_deseq()
    deseq.get_deseq_result()
    r = deseq.deseq_result
    r.pvalue = np.nan_to_num(np.array(r.pvalue), 1)
    r.log2FoldChange = np.nan_to_num(np.array(r.log2FoldChange), 0)
    results[val] = r
RNP_IRF2BP2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 147 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MYB
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 132 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_LYL1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 157 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MAX
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 160 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_IRF8
3.3.2
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 215 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_GFI1
3.3.2
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 212 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_ZEB2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 103 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_ZMYND8
3.3.2
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 210 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_SP1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 160 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_SPI1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 139 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_RUNX2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 156 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MEF2C
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 153 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_HOXA9
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 154 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MYC
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 159 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_IKZF1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 154 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_RUNX1
3.3.2
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 214 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_CEBPA
3.3.2
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 215 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MEF2D
3.3.2
R[write to console]: estimating size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 214 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_ELF2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 157 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_FLI1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 157 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MEIS1
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 158 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_MYBL2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 162 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

RNP_LMO2
3.3.2
estimating sizeFactors for this one
R[write to console]: using pre-existing size factors

R[write to console]: estimating dispersions

R[write to console]: gene-wise dispersion estimates

R[write to console]: mean-dispersion relationship

R[write to console]: final dispersion estimates

R[write to console]: fitting model and testing

R[write to console]: -- replacing outliers and refitting for 157 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)

R[write to console]: estimating dispersions

R[write to console]: fitting model and testing

volcano plot with CRC members highlighted

In [ ]:
results
In [149]:
for val in experiments:
    a = h.volcano(results[val],tohighlight=ctf,title=val, maxvalue= 60, searchbox=True, showlabels=True,folder="../results/"+project+"/plots/"+version+'_')
    try:
        show(a)
    except RuntimeError:
        show(a)
In [153]:
ls ../results/$project/plots/
cluster_corr_count_gene_removed_scaling_2.pdf
cluster_corr_count.pdf
cluster_corr_count_scaling_2.pdf
clustermap_ctf_deseq_all_scaled.pdf
clustermap_ctf_deseq.pdf
correlation_scaling_2.pdf
enriched_terms_scaled_gsea.pdf
enriched_terms_scaled_gsea.png
enriched_terms_scaled.png
scaling_2_RNP_CEBPA_volcano.pdf
scaling_2_RNP_ELF2_volcano.pdf
scaling_2_RNP_FLI1_volcano.pdf
scaling_2_RNP_GFI1_volcano.pdf
scaling_2_RNP_HOXA9_volcano.pdf
scaling_2_RNP_IKZF1_volcano.pdf
scaling_2_RNP_IRF2BP2_volcano.pdf
scaling_2_RNP_IRF8_volcano.pdf
scaling_2_RNP_LMO2_volcano.pdf
scaling_2_RNP_LYL1_volcano.pdf
scaling_2_RNP_MAX_volcano.pdf
scaling_2_RNP_MEF2C_volcano.pdf
scaling_2_RNP_MEF2D_volcano.pdf
scaling_2_RNP_MEIS1_volcano.pdf
scaling_2_RNP_MYBL2_volcano.pdf
scaling_2_RNP_MYB_volcano.pdf
scaling_2_RNP_MYC_volcano.pdf
scaling_2_RNP_RUNX1_volcano.pdf
scaling_2_RNP_RUNX2_volcano.pdf
scaling_2_RNP_SP1_volcano.pdf
scaling_2_RNP_SPI1_volcano.pdf
scaling_2_RNP_ZEB2_volcano.pdf
scaling_2_RNP_ZMYND8_volcano.pdf
scaling_2_scaling_fact_with_conf.pdf
transcriptome correlation.html
In [152]:
!mkdir ../results/$project/deseq_$version
for k, val in results.items():
    val.to_csv('../results/'+project+'/deseq_'+version+'/'+k+".csv")
In [49]:
results = {}
des = ! ls ../results/$project/deseq_$version/*.csv
for val in des:
    results["RNP_"+val.split('RNP_')[-1].split('.')[0]] = pd.read_csv(val,index_col=0)

Making the ccsv file for max

In [50]:
des
Out[50]:
['../results/RNPv2/deseq_scaling_2/RNP_RNP_CEBPA.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_ELF2.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_FLI1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_GFI1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_HOXA9.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_IKZF1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_IRF2BP2.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_IRF8.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_LMO2.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_LYL1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MAX.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MEF2C.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MEF2D.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MEIS1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MYB.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MYBL2.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_MYC.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_RUNX1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_RUNX2.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_SP1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_SPI1.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_ZEB2.csv',
 '../results/RNPv2/deseq_scaling_2/RNP_RNP_ZMYND8.csv']
In [52]:
results.keys()
Out[52]:
dict_keys(['RNP_CEBPA', 'RNP_ELF2', 'RNP_FLI1', 'RNP_GFI1', 'RNP_HOXA9', 'RNP_IKZF1', 'RNP_IRF2BP2', 'RNP_IRF8', 'RNP_LMO2', 'RNP_LYL1', 'RNP_MAX', 'RNP_MEF2C', 'RNP_MEF2D', 'RNP_MEIS1', 'RNP_MYB', 'RNP_MYBL2', 'RNP_MYC', 'RNP_RUNX1', 'RNP_RUNX2', 'RNP_SP1', 'RNP_SPI1', 'RNP_ZEB2', 'RNP_ZMYND8'])
In [51]:
results.pop('RNP_all')
------------------------------------------------------------------
KeyError                         Traceback (most recent call last)
<ipython-input-51-48bc20ddc420> in <module>
----> 1 results.pop('RNP_all')

KeyError: 'RNP_all'
In [154]:
tosave = pd.DataFrame(index=results['RNP_CEBPA'].index)
for k,v in results.items():
    tosave[k+'_fc_log2'] = v.log2FoldChange
    tosave[k+'_padj'] = v.padj
    tosave[k+'_pval'] = v.pvalue
In [155]:
tosave.to_csv('../results/'+project+'/deseq_RNP_all_'+version+'.csv')

Looking at CRC members only

In [7]:
ctf.extend(['IRF2BP2','MYBL2','IKZF1'])

maybe use adjusted p_value

In [157]:
deseq = pd.DataFrame(index=ctf)
for k, val in results.items():
    deseq[k] = [i.log2FoldChange if i.pvalue<0.05 else 0 for a, i in val.loc[ctf].iterrows()]
In [ ]:
deseq
In [158]:
fig = sns.clustermap(figsize=(25,20), cmap=plt.cm.RdYlBu, data=deseq,vmin=-1,vmax=1,xticklabels=deseq.columns, yticklabels=deseq.index)
fig.savefig('../results/'+project+'/plots/clustermap_ctf_deseq_all_scaled_'+version+'.pdf')
In [159]:
deseq.columns = [i.split('_')[1] for i in deseq.columns]
deseq = deseq.loc[deseq.columns]
In [160]:
deseq.to_csv('../results/'+project+'/deseq_CTFmat'+version+'.csv')
In [24]:
deseq = pd.read_csv('../results/'+project+'/deseq_CTFmat'+version+'.csv',index_col=0)
------------------------------------------------------------------
FileNotFoundError                Traceback (most recent call last)
<ipython-input-24-2ed75ac54d23> in <module>
----> 1 deseq = pd.read_csv('../results/'+project+'/deseq_CTFmat'+version+'.csv',index_col=0)

~/.local/lib/python3.8/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    674         )
    675 
--> 676         return _read(filepath_or_buffer, kwds)
    677 
    678     parser_f.__name__ = name

~/.local/lib/python3.8/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    446 
    447     # Create the parser.
--> 448     parser = TextFileReader(fp_or_buf, **kwds)
    449 
    450     if chunksize or iterator:

~/.local/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    878             self.options["has_index_names"] = kwds["has_index_names"]
    879 
--> 880         self._make_engine(self.engine)
    881 
    882     def close(self):

~/.local/lib/python3.8/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1112     def _make_engine(self, engine="c"):
   1113         if engine == "c":
-> 1114             self._engine = CParserWrapper(self.f, **self.options)
   1115         else:
   1116             if engine == "python":

~/.local/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1889         kwds["usecols"] = self.usecols
   1890 
-> 1891         self._reader = parsers.TextReader(src, **kwds)
   1892         self.unnamed_cols = self._reader.unnamed_cols
   1893 

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] File ../results/RNPv2/deseq_CTFmatscaling_2.csv does not exist: '../results/RNPv2/deseq_CTFmatscaling_2.csv'
In [175]:
net = nx.from_pandas_adjacency(((deseq < -0.3) | (deseq > 0.3)).T,create_using=nx.DiGraph)
In [176]:
pos = nx.nx_agraph.graphviz_layout(net, prog="neato")
In [177]:
colors = ['red' if deseq.loc[i[1],i[0]]> 0 else 'blue' for i in net.edges]

blue is down, red is up

In [178]:
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True,edge_color=colors)
plt.show()
plt.savefig("../results/"+project+"/plots/"+version+"_interaction_graph_binary.pdf")
<Figure size 432x288 with 0 Axes>
In [179]:
deseq[(deseq > -0.3) & (deseq < 0.3)]=0
net = nx.from_pandas_adjacency(deseq.T,create_using=nx.DiGraph)
In [180]:
pos = nx.nx_agraph.graphviz_layout(net, prog='dot')
In [181]:
colors = [-deseq.loc[i[1],i[0]] for i in net.edges]
In [182]:
colors = [i/-min(colors) if i <0 else i/max(colors) for i in colors]
In [183]:
plt.figure(figsize=(8, 8))
nx.draw(net,pos,with_labels=True, edge_color=colors,edge_cmap=plt.cm.RdYlBu)
plt.show()
plt.savefig("../results/"+project+"/plots/"+version+"_interaction_graph_continuous.pdf")
<Figure size 432x288 with 0 Axes>

We are looking for bias in the data and the replicates

In [184]:
col = {v:i for i, v in enumerate(set([i.split('-')[2] for i in data.columns[:-1]]))}
In [192]:
red = PCA(2).fit_transform(data[data.columns[:-1]].T)
h.scatter(red, labels=data.columns[:-1], title="PCA plot across replicates", radi=60000, colors=[col[i.split('-')[2]] for i in data.columns[:-1]], folder= "../results/"+project+"/plots/"+version+"_")
Out[192]:
Figure(
id = '9327', …)
In [195]:
red = PCA(30).fit_transform(data[data.columns[:-1]].T)
red = TSNE(2,4).fit_transform(red)
h.scatter(red, labels=data.columns[:-1], title='TSNE plot across replicates', radi=10, colors=[col[i.split('-')[2]] for i in data.columns[:-1]], folder= "../results/"+project+"/plots/"+version+"_")
INFO:bokeh.io.state:Session output file '../results/RNPv2/plots/scaling_2_TSNE_plot_across_replicates_scatter.pdf' already exists, will be overwritten.
Out[195]:
Figure(
id = '10315', …)

mr129-MYC-r4 seems weird

GSEA analysis

In [ ]:
data
In [187]:
res = {}
In [188]:
experiments
Out[188]:
['RNP_IRF2BP2',
 'RNP_MYB',
 'RNP_LYL1',
 'RNP_MAX',
 'RNP_IRF8',
 'RNP_GFI1',
 'RNP_ZEB2',
 'RNP_ZMYND8',
 'RNP_SP1',
 'RNP_SPI1',
 'RNP_RUNX2',
 'RNP_MEF2C',
 'RNP_HOXA9',
 'RNP_MYC',
 'RNP_IKZF1',
 'RNP_RUNX1',
 'RNP_CEBPA',
 'RNP_MEF2D',
 'RNP_ELF2',
 'RNP_FLI1',
 'RNP_MEIS1',
 'RNP_MYBL2',
 'RNP_LMO2']
In [189]:
res
Out[189]:
{}
In [199]:
for val in experiments:
    print(val)
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    res[val] = gseapy.gsea(data=totest, gene_sets='WikiPathways_2013', 
                cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size").set_title(val)
RNP_IRF2BP2
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_MYB
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_LYL1
RNP_GFI1
RNP_ZEB2
RNP_ZMYND8
RNP_SP1
RNP_SPI1
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_RUNX2
RNP_MEF2C
RNP_HOXA9
RNP_MYC
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_IKZF1
RNP_RUNX1
RNP_CEBPA
RNP_MEF2D
RNP_ELF2
RNP_FLI1
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_MEIS1
RNP_MYBL2
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_LMO2
In [208]:
with open('../data/'+project+'/'+version+'_wikipathway_RNPv2', 'wb') as f:
    pickle.dump(res,f)
In [ ]:
with open('../data/'+project+'/'+version+'_wikipathway_RNPv2','rb') as f:
    res = pickle.load(f)

Analysis on the wiki pathways geneset

In [207]:
for val in experiments:
    res[val].res2d['Term'] = [i[3:].split('WP')[0] for i in res[val].res2d['Term']]
    a = sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size")
    a.set_title(val)
    a.figure.savefig('../results/'+ project+ '/plots/'+version + '_wikipathway_mainTerms.pdf')
    plt.show()
In [209]:
a = set()
for k, val in res.items():
    a.update(set(val.res2d.index))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[i][n] = v.es
In [210]:
res = pd.DataFrame(a, index=res.keys())
In [211]:
res.columns = [i[3:].split('WP')[0] for i in res.columns]
In [212]:
res.index = [i.split('_')[1] for i in res.index]
In [213]:
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1,xticklabels=res.columns, yticklabels=res.index)
fig.savefig("../results/"+project+"/"+version+"enriched_terms_scaled_gsea.pdf")
In [214]:
res.to_csv('../results/'+project+'/'+version+'_wikipathway_gsea_matrix.csv')
In [5]:
res = {}
In [4]:
res.keys()
-----------------------------------------------------------
NameError                 Traceback (most recent call last)
<ipython-input-4-fa6bd5b552ae> in <module>
----> 1 res.keys()

NameError: name 'res' is not defined

Analysis on the entire set of pathways (biopathways)

In [10]:
for i, val in enumerate(experiments):
    print(val)
    totest = data[[v for v in data.columns[:-1] if val+'-' in v or 'AAVS1' in v]]
    cls = ['Condition' if val+'-' in v else 'DMSO' for v in totest.columns]
    if abs(scaling[val.split('_')[1]][0]) > 3*scaling[val.split('_')[1]][1]:
        print("rescaling this one")
        cols = [i for i in totest.columns if val+'-' in i]
        totest[cols] = totest[cols]*(2**scaling[val.split('_')[1]][0])
    elif val in res:
        continue
    res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015', 
                cls= cls, no_plot=False, processes=8)
    res[val].res2d['Term'] = [i for i in res[val].res2d.index]
    plt.figure(i)
    a = sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
                hue_order="geneset_size")
    a.set_title(val)
    a.figure.savefig('../results/'+ project+ '/plots/'+version + '_gobioproc2015_mainTerms_'+val+'.pdf')
RNP_LYL1
RNP_SPI1
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_ZEB2
RNP_SP1
RNP_HOXA9
RNP_RUNX2
RNP_MYBL2
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_RUNX1
RNP_FLI1
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_MEF2C
RNP_IRF8
RNP_MYB
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_MEIS1
RNP_GFI1
RNP_MYC
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_MEF2D
RNP_IKZF1
RNP_IRF2BP2
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_ZMYND8
RNP_MAX
rescaling this one
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2963: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
RNP_ELF2
<ipython-input-10-7735467c8df0>:14: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  plt.figure(i)
RNP_CEBPA
<ipython-input-10-7735467c8df0>:14: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  plt.figure(i)
RNP_LMO2
<ipython-input-10-7735467c8df0>:14: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  plt.figure(i)
In [11]:
with open('../results/'+project+'/'+version+'_GO_Biological_Process_2015_RNPv2', 'wb') as f:
    pickle.dump(res,f)
In [ ]:
with open('../results/'+project+'/'+version+'_GO_Biological_Process_2015_RNPv2', 'wb') as f:
    res = pickle.load(f)
In [12]:
for i, v in res.items():
    res[i].res2d['Term'] = [i.split('(GO')[0] for i in v.res2d['Term']]

creating matrices

In [13]:
a = set()
for k, val in res.items():
    a.update(set(val.res2d.Term))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
    for i,v in val.res2d.iterrows():
        a[v.Term][n] = v.es
res = pd.DataFrame(a, index=res.keys())
In [14]:
fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1, yticklabels=res.index ,cmap=plt.cm.RdYlBu)
fig.savefig("../results/"+project+"/"+version+"enriched_terms_scaled_gsea_bioproc2015.pdf")
------------------------------------------------------------------
NameError                        Traceback (most recent call last)
<ipython-input-14-d951df3cdd71> in <module>
      1 fig = sns.clustermap(figsize=(25,20), data=res,vmin=-1,vmax=1, yticklabels=res.index ,cmap=plt.cm.RdYlBu)
----> 2 fig.savefig("../results/"+folder+"/"+version+"enriched_terms_scaled_gsea.pdf")

NameError: name 'folder' is not defined
In [18]:
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(res)
In [19]:
sort = labels.argsort()

Correlation matrix across experiment on at the gene set level

In [16]:
a = sns.clustermap(-res.T.corr(),cmap=plt.cm.RdYlBu,vmin=-1,vmax=1)
a.savefig("../results/"+project+"/plots/"+version+"_clustermap_over_bioproc2015_geneset.pdf")
In [32]:
a = h.plotCorrelationMatrix(res.values[sort], res.index[sort].tolist(), interactive=True, title="correlation matrix over bioproc2015 gene sets", folder="../results/"+project+"/plots/"+version+"_")#,colors=[labels[i] for i in sort]) 
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('alphas', 529), ('colors', 529), ('data', 23), ('xname', 529), ('yname', 529)

similarity distance plot over the genesets

In [70]:
red = PCA(2).fit_transform(res)
h.scatter(red, labels=res.index, radi=1, colors=labels, title="PCA distance in geneset space", showlabels=True,folder="../results/"+project+"/plots/"+version+"_")
INFO:bokeh.io.state:Session output file '../results/RNPv2/plots/scaling_2_PCA_distance_in_geneset_space_scatter.pdf' already exists, will be overwritten.
Out[70]:
Figure(
id = '6933', …)
In [69]:
red = TSNE(2,2).fit_transform(res)
h.scatter(red, labels=res.index, radi=9, colors=labels, title="TSNE distance in geneset space",showlabels=True,folder="../results/"+project+"/plots/"+version+"_")
INFO:bokeh.io.state:Session output file '../results/RNPv2/plots/scaling_2_TSNE_distance_in_geneset_space_scatter.pdf' already exists, will be overwritten.
Out[69]:
Figure(
id = '6532', …)
In [43]:
res.to_csv('../results/'+project+'/'+version+'_biopathway_gsea.csv')
In [ ]:
res = pd.read_csv('../results/'+project+'/'+version+'_biopathway_gsea.csv',index_col=0)

Getting the correlation at the transcriptome level

In [53]:
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
    data[i]=v.log2FoldChange
In [54]:
model = AgglomerativeClustering(n_clusters=8,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(data.values.T)
ii = itertools.count(data.values.shape[1])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
In [56]:
model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
labels = model.fit_predict(data.values)
In [58]:
a = sns.clustermap(data.corr())
a.savefig("../results/"+project+"/plots/"+version+"_clustermap_correlation_transcriptome.pdf")
In [59]:
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation",folder= "../results/"+project+"/plots/"+version+"_")#,colors=[labels[i] for i in sort]) 
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('alphas', 529), ('colors', 529), ('data', 23), ('xname', 529), ('yname', 529)
In [ ]:
## Filtered version (set to 0 genes with low p_value)
In [60]:
data = pd.DataFrame(index=results['RNP_SP1'].index.tolist())
for i, v in results.items():
    v.loc[v[v.pvalue>0.01].index,"log2FoldChange"]==0
    data[i]=v.log2FoldChange
In [61]:
a = h.plotCorrelationMatrix(data.values.T[sort], data.columns[sort].tolist(), interactive=True, title="transcriptome correlation matrix over high pvalue genes",folder="../results/"+project+"/plots/"+version+"_")
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('alphas', 529), ('colors', 529), ('data', 23), ('xname', 529), ('yname', 529)
INFO:bokeh.io.state:Session output file '../results/RNPv2/plots/scaling_2_transcriptome_correlation_correlation.pdf' already exists, will be overwritten.
In [62]:
mostvar = data[(~data.index.str.contains('ERCC-')) & (data.var(1)>4)]
In [63]:
a = sns.clustermap(-mostvar.corr(), cmap=plt.cm.RdYlBu, vmin=-1, vmax=1)
a.savefig("../results/"+project+"/plots/"+version+"_clustermap_mostvariable_genes.pdf")

Doing it at the CRC members' expression only

In [82]:
results_ctf = data.loc[set(['MYC',
 'MYB',
 'SPI1',
 'RUNX1',
 'IRF2BP2',
 'FLI1',
 'ELF2',
 'ZEB2',
 'GFI1',
 'LMO2',
 'CEBPa',
 'MEF2D',
 'MEF2C',
 'IRF8',
 'MEIS1',
 'RUNX2',
 'ZMYND8']) & set(data.index)].T
model = AgglomerativeClustering(n_clusters=7,linkage="average", 
                                affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(results_ctf)
ii = itertools.count(results_ctf.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
h.plotCorrelationMatrix(results_ctf.values[sort], title="correlation matrix over similarity over CRC members aggregated with agglomerative clustering", folder="../results/"+project+'/plots/'+version+'_',names=results_ctf.index[sort].tolist(),interactive=True)
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('alphas', 529), ('colors', 529), ('data', 23), ('xname', 529), ('yname', 529)
Out[82]:
Figure(
id = '8098', …)
In [80]:
a = sns.clustermap(results_ctf.corr())
a.set_title("clustermap of correlation matrix over similarity over CRC members")
a.figure.savefig('../results/'+project+'/plots/'+version+"_clustermap_correlation_CRConly.pdf")
Out[80]:
<seaborn.matrix.ClusterGrid at 0x7fcc11171a30>
In [92]:
a.fig.savefig('../results/'+project+'/plots/'+version+"_pairplot_deseq_crconly.pdf")
In [91]:
a = sns.pairplot(results_ctf.T)
a.fig.suptitle("pairplot of differential expression of CRC members under RNP of these members", y=1.08)
------------------------------------------------------------------
AttributeError                   Traceback (most recent call last)
<ipython-input-91-dfb9ee737b22> in <module>
      1 a = sns.pairplot(results_ctf.T)
      2 a.fig.suptitle("pairplot of differential expression of CRC members under RNP of these members", y=1.08)
----> 3 a.figure.savefig('../results/'+project+'/plots/'+version+"_pairplot_deseq_crconly.pdf")

AttributeError: 'PairGrid' object has no attribute 'figure'
In [ ]: